Cells & Genes filtering
Keep the 'high quality' cells identified from the preliminary analysis / pre-processing steps.
# Import useful modules
import numpy as np
import pandas as pd
import scanpy as sc
import os
#import igraph
import matplotlib.pyplot as plt
import seaborn
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=90)
adata = sc.read_h5ad('/data/PreProcessed_dataset.h5ad')
metadata = pd.read_csv('/Data/fastMNN_meta.txt', sep = '\t')
metadata.index.tolist()[0:10]
metadata.index = metadata['index']
metadata.index.tolist()[0:10]
## Keep High Quality cells
list_of_cell_names = metadata.index.tolist()
adata = adata[list_of_cell_names, ]
adata.shape
# Remove unwanted genes (mitochondrial genes)
genes_names = adata.var.index.tolist()
keep_genes = [i for i in genes_names if not i.startswith('MT-')]
adata = adata[:,keep_genes]
adata.raw = adata
adata.obs['n_counts_log'] = np.log10(adata.obs['n_counts'])
sc.pp.filter_genes(adata, min_cells=100)
adata.X.shape
sc.pp.highly_variable_genes(adata, min_mean=0.01, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
np.sum(adata.var['highly_variable'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
# Load fastMNN PCA (Data integration results)
pca = pd.read_csv('/Data/fastMNN_PCA.txt', sep = '\t')
pca.index = metadata.index
adata.obsm['X_pca'].shape
pca_numpy = pca.as_matrix()
pca_numpy.shape
adata.obsm['X_pca'] = pca_numpy
sc.pl.pca_variance_ratio(adata, log = False)
sc.pl.pca_loadings(adata, components=list(range(0,16)))
sc.pp.neighbors(adata, n_neighbors=100, n_pcs=12)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['manip'],edges = False)
sc.pl.umap(adata, color=['donor','position','method'],edges = False)
sc.pl.umap(adata, color=['n_counts_log', 'n_genes', 'percent_mito', "XIST"],edges = False)
adata.obs['previous_cell_type'] = metadata['cell_type']
sc.pl.umap(adata, color=['previous_cell_type'],edges = False)
sc.tl.tsne(adata, n_pcs = 8, perplexity = 100)
sc.pl.tsne(adata, color=['manip'])
sc.pl.tsne(adata, color=['donor','position','method'])
sc.pl.tsne(adata, color=['n_counts_log', 'n_genes', 'percent_mito', "XIST"])
sc.pl.tsne(adata, color=['previous_cell_type'])
communities, graph, Q = sc.external.tl.phenograph(adata.obsm['X_pca'][:,0:12], k=100)
adata.obs['phenograph'] = pd.Categorical(communities)
pd.Categorical(communities)
sc.pl.umap(adata, color=['phenograph', 'manip'])
communities, graph, Q = sc.external.tl.phenograph(adata.obsm['X_pca'][:,0:25], k=10)
adata.obs['phenograph_2'] = pd.Categorical(communities)
pd.Categorical(communities)
sc.pl.umap(adata, color=['phenograph_2', 'phenograph'])
mean_cellType = np.empty((len(adata.obs['phenograph'].unique()), adata.raw.shape[1]),
dtype=float, order='C')
raw_adata = adata.raw.X.toarray()
for i in range(0, len(adata.obs['phenograph'].unique())):
mean_cellType[i,:] = np.mean(raw_adata[adata.obs['phenograph'] == adata.obs['phenograph'].unique()[i], :], axis = 0)
mean_df = pd.DataFrame(np.corrcoef(mean_cellType), index = adata.obs['phenograph'].unique(), columns = adata.obs['phenograph'].unique())
ax = seaborn.clustermap(mean_df, cmap="jet").fig.suptitle('Phenograph')
mean_cellType = np.empty((len(adata.obs['phenograph_2'].unique()), adata.raw.shape[1]),
dtype=float, order='C')
for i in range(0, len(adata.obs['phenograph_2'].unique())):
mean_cellType[i,:] = np.mean(raw_adata[adata.obs['phenograph_2'] == adata.obs['phenograph_2'].unique()[i], :], axis = 0)
mean_df = pd.DataFrame(np.corrcoef(mean_cellType), index = adata.obs['phenograph_2'].unique(), columns = adata.obs['phenograph_2'].unique())
seaborn.set(font_scale=0.5)
ax = seaborn.clustermap(mean_df, cmap="jet").fig.suptitle('Phenograph_2')
seaborn.set_style("white")
Marker genes identified during the individual analysis of each sample.
sc.pl.umap(adata, color=['MKI67', 'TOP2A', 'UBE2C', 'CDK1'], color_map="jet") # Cycling Basal
sc.pl.umap(adata, color=['KRT5', 'DLK2', 'MIR205HG', 'TP63'], color_map="jet") # Basal
sc.pl.umap(adata, color=['SERPINB3', 'KRT19', 'MT1X', 'NTS'], color_map="jet") # Suprabasal
sc.pl.umap(adata, color=['SCGB1A1', 'BPIFA1', 'VMO1', 'MSMB'], color_map="jet") # Secretory
sc.pl.umap(adata, color=['MUC5AC', 'MUC5B', 'SCGB3A1'], color_map="jet") # Secretory
sc.pl.umap(adata, color=['LYZ', 'PIP', 'FOLR1', 'AZGP1'], color_map="jet") # Glandular
sc.pl.umap(adata, color=['CDC20Bshort', 'CCNO', 'HES6', 'CCDC67'], color_map="jet") # Deuterosomal
sc.pl.umap(adata, color=['FOXJ1', 'TPPP3', 'PIFO', 'IFT57'], color_map="jet") # Multiciliated
sc.pl.umap(adata, color=['VIM', 'DARC', 'AQP1', 'GNG11'], color_map="jet") # Endothelial
sc.pl.umap(adata, color=['TAGLN', 'ACTA2', 'MYL9', 'CALD1'], color_map="jet") # Smooth muscle
sc.pl.umap(adata, color=['FBLN1', 'DPT', 'LUM', 'COL1A1'], color_map="jet") # Fibroblast
sc.pl.umap(adata, color=['TYROBP', 'CD68', 'LST1', 'AIF1'], color_map="jet") # Macrophage
sc.pl.umap(adata, color=['LGALS2', 'VCAN', 'FCN1', 'CFP'], color_map="jet") # Monocytes
sc.pl.umap(adata, color=['CCL17', 'IL1R2', 'S100B', 'FCGR2B'], color_map="jet") # Dendritic cells
sc.pl.umap(adata, color=['IGJ', 'CD79A', 'PTPN7', 'IGLL5'], color_map="jet") # B cells
sc.pl.umap(adata, color=['NKG7', 'CD3D', 'GZMA', 'LTB'], color_map="jet") # LT/NK
sc.pl.umap(adata, color=['TPSAB1', 'SLC18A2', 'CPA3', 'LTC4S'], color_map="jet") # Mast cells
sc.pl.umap(adata, color=['ASCL3', 'CFTR', 'PCSK1N'], color_map="jet") # Ionocyte / PNEC - , 'SCG5'
Wilcoxon's rank test (one-VS-all differential expression testing design)
sc.settings.set_figure_params(dpi=130)
sc.pl.umap(adata, color=['phenograph'], legend_loc = 'on data', legend_fontsize = 6)
sc.tl.rank_genes_groups(adata, groupby = 'phenograph', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.tl.rank_genes_groups(adata, groupby = 'phenograph_2', method='wilcoxon')
sc.pl.umap(adata, color=['phenograph_2'], legend_loc = 'on data', legend_fontsize = 6)
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
new_cluster_names = {
'0':'Basal', '1':'Multiciliated', '2':'Basal', '3':'Suprabasal', '4':'Secretory', # 0-4
'5':'Secretory N','6':'Suprabasal', '7':'Suprabasal', '8':'Suprabasal N', '9':'Serous', # 5-9
'10':'Endothelial', '11':'Macrophage', '12':'Monocyte', '13':'Smooth muscle', '14':'Secretory', # 10-14
'15':'LT/NK', '16':'Secretory N', '17':'SMG Goblet', '18':'Suprabasal N', '19':'Deuterosomal', # 15-19
'20':'Rare cells', '21':'Mast cells', '22':'B cells'}
new_small_cluster_names = {
'19':'LT/NK', '18':'Monocyte', '32':'Mast cells', '33':'B cells', '36':'Plasma cells',
'26':'Smooth muscle', '29':'Fibroblast', '34':'Rare cells',
'35': 'doublet', '40':'doublet', '41':'doublet'}
vect = []
for i in range(0, len(adata.obs['phenograph'])):
if (str(adata.obs['phenograph_2'][i]) in new_small_cluster_names.keys()) :
vect = vect + [new_small_cluster_names[str(adata.obs['phenograph_2'][i])]]
else :
vect = vect + [new_cluster_names[str(adata.obs['phenograph'][i])]]
adata.obs['cell_type'] = vect
## Last identification of peculiar cells supposed to be doublet cells.
list_cell_names = adata.obs.loc[adata.obs['cell_type'] != 'doublet', ].index.tolist()
adata = adata[list_cell_names, ]
adata.uns['cell_type_colors'] = [ '#7e6384',# B cells '#7e6384'
'#A31E22', #, Basal ,
#'#db3d42',# Basal D
#'#F3766E', # Cycling Basal,
'#2da9d2', # Deuterosomal
'#281c1c', '#aea9ce', # Endothelial, Fibroblast
#'#006600', # Goblet
'#ff1aff', '#c997e7', #LT/NK, Macrophage
'#e366ff', #Mast cells
'#ca81ca', #Monocyte
'#466cb9', #'#90a7d5',# Multiciliated
"#ca81ca", #Plasma cells
'#b34d6d', # Rare cells
'#5edbbd', # SMG Goblet
'#53c653', '#a9c653', # Secretory
'#3e937f', # Serous
'#7c5959', # Smooth muscle
'#FCCC0A', '#e0c96c'] # Suprabasal, '#ecec13'
sc.pl.umap(adata, color=['cell_type'])
cell_type_nb = {}
list_cell_type = adata.obs['cell_type'].unique().tolist()
list_cell_type.sort()
for i in range(0, len(adata.obs['cell_type'].unique().tolist())):
cell_type_nb[list_cell_type[i]] = i
cell_type_nb
vect = []
for i in range(0, len(adata.obs['cell_type'])):
vect = vect + [str(cell_type_nb[str(adata.obs['cell_type'][i])])]
adata.obs['cell_type_nb'] = vect
adata.uns['cell_type_nb_colors'] = adata.uns['cell_type_colors']
sc.pl.umap(adata, color=['cell_type_nb'])
sc.tl.rank_genes_groups(adata, groupby = 'cell_type_nb', method='wilcoxon', n_genes = 1000)
adata.write('/Data/Annotated_dataset_v1.h5ad')
inverted_cell_type_nb = dict([[v,k] for k,v in cell_type_nb.items()])
clusters = []
genes = []
logFC = []
score = []
pvals = []
pvals_adj = []
for cl in inverted_cell_type_nb.keys():
clusters = clusters + ([inverted_cell_type_nb[cl]]*len(adata.uns['rank_genes_groups']['names'][str(cl)]))
genes = genes + adata.uns['rank_genes_groups']['names'][str(cl)].tolist()
logFC = logFC + adata.uns['rank_genes_groups']['logfoldchanges'][str(cl)].tolist()
score = score + adata.uns['rank_genes_groups']['scores'][str(cl)].tolist()
pvals = pvals + adata.uns['rank_genes_groups']['pvals'][str(cl)].tolist()
pvals_adj = pvals_adj + adata.uns['rank_genes_groups']['pvals_adj'][str(cl)].tolist()
markers = pd.DataFrame(data = {'clusters': clusters,
'genes':genes,
'logFC':logFC,
'score':score,
'pvals':pvals,
'pvals_adj':pvals_adj,
})
markers.to_csv(path_or_buf = '/Data/markers_v1.tsv',
sep = '\t', index = False)